* Work inside the folder that holds the string-matching output files.
cd D:\KRpatent\data\family\string_matching

* Perfect-name matches: read the "*"-delimited file, drop column v1 and
* rename the source/target name columns.
import delimited matched_uspto_perfect_k2d.csv, delimiter("*") encoding(UTF-8) clear
drop v1
rename (src_name trg_name) (standard_name dgstd)
* mdup holds the number of rows sharing an assgid; every assgid that
* appears on more than 5 rows is discarded entirely.
bysort assgid: generate mdup = _N
drop if mdup > 5
drop mdup
save perfect_temp, replace

* Score-based matches, k2d file: the source stem becomes kipstem and the
* target stem becomes dgstem.
import delimited matched_uspto_scorebased_k2d.csv, delimiter("*") encoding(UTF-8) clear
drop v1
rename (src_stem trg_stem) (kipstem dgstem)
save scorebased_temp, replace

* Score-based matches, d2k file: here the source stem is dgstem and the
* target stem is kipstem.
import delimited matched_uspto_scorebased_d2k.csv, delimiter("*") encoding(UTF-8) clear
drop v1
rename (src_stem trg_stem) (dgstem kipstem)
* Keep only the (symbol, assgid) pairs that occur in both files; the
* merge indicator is not kept.
merge 1:1 symbol assgid using scorebased_temp, keep(match) nogenerate
save scorebased_temp, replace

* Stack the perfect matches and the score-based matches into one file and
* record which of the two each row came from.
use perfect_temp, clear
generate match_phase = "Fperfect"
append using scorebased_temp
* Rows whose phase is still empty after the append are labelled "Fscore".
replace match_phase = "Fscore" if match_ph==""
save temp, replace

* Combine the family-based matches with the string-matching results saved
* in temp, label the phase of every row, and save the combined file.
use family_match, clear /* This should come from "2. family" folder */
* NOTE(review): Stata documents this match type as m:1 -- confirm that the
* n:1 spelling is accepted by the installed Stata version.
merge n:1 assgid symbol using temp
* Show the listed string columns with a 40-character display format.
format standard stem dgstd dgstem kipstd kipstem %40s
* Move the listed variables to the front of the dataset, in this order.
order match relative assgid standard stem symbol dgstd dgstem kipstd kipstem
* Descending sort on relative, then on the variables abbreviated sc and rsc.
gsort -relative -sc -rsc
* Discard rows that carry no symbol.
drop if symbol == ""
* Rows without a phase, and rows found in both datasets (_m == 3), are
* labelled "Family"; on matched rows this overwrites the phase from temp.
replace match_ph = "Family" if match_ph == "" | _m == 3
* Remove rows that are identical on every variable.
duplicates drop 
sa uspto_matches_temp, replace

* Duplicates *
* Reduce the combined matches to (at most) one symbol per assgid.
use uspto_matches_temp, clear
* assgid: symbol = 1 : n
** drop B symbol
* encode numbers the first letters of symbol in alphabetical order, so the
* per-assgid minimum selects the alphabetically first letter; rows whose
* symbol starts with a later letter are removed.
generate temp = substr(symbol,1,1)
encode temp, generate(type)
drop temp
bysort assgid: egen mtype = min(type)
keep if mtype == type
drop type mtype
** keep family
* encode orders the phase labels alphabetically ("Family" < "Fperfect" <
* "Fscore"), so the per-assgid minimum keeps the rows of the first phase
* that is present for that assgid.
encode match_ph, generate(type)
bysort assgid: egen mtype = min(type)
keep if type == mtype
drop type mtype
** drop if still duplicated
keep match_ph assgid symbol
duplicates drop
* dup is 0 for an assgid with a single remaining row; assgid that still have
* several rows are dropped, except rows of the two symbols listed below.
bysort assgid: generate dup = cond(_N==1,0,_n)
keep if dup == 0 | symbol == "A000150" | symbol == "A004800"
drop dup
* assgid: symbol = n : 1
** harmonize
* When several assgid map to one symbol, the smallest assgid of that symbol
* becomes the harmonized id assgidH. egen stores its result as float by
* default, and float cannot represent integers above 16,777,216 exactly,
* so the id is stored as double to keep it exact.
sort symbol assgid
quietly by symbol: egen double assgidH = min(assgid)
save temp, replace
* Dictionary from every original assgid to its harmonized id.
keep assgid assgidH
duplicates drop
save assgidH_dict, replace
* Final list: one phase per symbol, keyed by the harmonized id.
use temp, clear
rename match phase
keep assgidH symbol phase
duplicates drop
* encode numbers the phase labels alphabetically, so the per-symbol minimum
* keeps only the rows of the alphabetically first phase of each symbol.
encode phase, generate(type)
bysort symbol: egen mtype = min(type)
keep if mtype == type
drop type mtype
save uspto_matches, replace





